

#Set directory
setwd('./data')

######################
#2004 dapil-kecamatan#
######################
kec_dprd_2004 = fread("./crosswalks/kecamatan_to_dprd2_2004.csv")
setnames(kec_dprd_2004, 'DAPIL.NUMBER', "DAPIL_NUMBER")
kec_dprd_2004[KECA %in% 1, KECA := 10]
kec_dprd_2004[, id_kec := paste0(PROP, sprintf("%02.f", as.numeric(KABU)), sprintf("%03.f", as.numeric(KECA)))]
kec_dprd_2004[, dapil := paste(KAB_NAME, DAPIL_NUMBER)]
kec_dprd_2004 = kec_dprd_2004[!is.na(DAPIL_NUMBER)]

#Unique kecamatan in 2004 dprd list
u_kec_dprd_2004 = unique(kec_dprd_2004[!is.na(id_kec), id_kec])

######################
#2009 dapil-kecamatan#
######################
kec_dprd_2009 = fread("./crosswalks/kecamatan_to_dprd2_2009.csv")

#Unique kecamatan in 2009 dprd list
u_kec_dprd_2009 = unique(kec_dprd_2009[!is.na(id_kec), id_kec])

################################
#World Bank violence data: NVMS#
################################

#List of world bank data files
wb_files = list.files('./NVMS', pattern = "\\.dta$")

#Read into list
wb_list = paste0("./NVMS/", wb_files) %>% 
  lapply(function(x) read.dta(x, convert.factors = F)) %>% 
  lapply(as.data.table)

#Combine data
wb_panel = rbindlist(wb_list, fill = T)

#Calculate dates/years
wb_panel[, date := as.Date(tanggal_kejadian, "%d/%m/%Y")]
wb_panel[, missing_kecamatan := 1*(kodebpskec1 %in% 0)]
wb_panel[, year := year(date)]

#Unique kecamatan from 2004 onwards
wb_kecamatan = wb_panel[missing_kecamatan %in% 0 & year >= 2004, list(kodebpskec1)] %>% unlist %>% unique


#######
#PODES#
#######

#2003#
######
podes_2003 = fread('./PODES/podes2003/PODES03A.csv')
podes_2003[, kec_code := paste0(sprintf('%02.f', PROP), sprintf('%02.f', KAB), sprintf('%03.f', KEC))]
podes_2003[, kab_code := paste0(sprintf('%02.f', PROP), sprintf('%02.f', KAB))]
podes_2003 = podes_2003[, list(kec_code)] %>% unlist %>% unique


#2008#
######
podes_2008 = fread('./PODES/podes2008/pds2008_d1_new.csv')
podes_2008[, kec_code := paste0(sprintf('%02.f', PROP), sprintf('%02.f', KAB), sprintf('%03.f', KEC))]
podes_2008[, kab_code := paste0(sprintf('%02.f', PROP), sprintf('%02.f', KAB))]
podes_2008 = podes_2008[, list(kec_code)] %>% unlist %>% unique

#2014#
######
podes_2014 = fread("./PODES/podes2014_correct/Podes_2014.csv")
podes_2014[, kec_code := paste0(sprintf('%02.f', R101), sprintf('%02.f', R102), sprintf('%03.f', R103))]
podes_2014[, kab_code := paste0(sprintf('%02.f', R101), sprintf('%02.f', R102))]
podes_2014 = podes_2014[, list(kec_code)] %>% unlist %>% unique


###################
#Fractionalization#
###################
fractionalization = fread('./census_2000_ethnicity/00-data/final/20190128_eth_rel_kec_data.csv')[, kec] %>% unique


##################
#Attitude Surveys#
##################

ifls = fread('./surveys/ifls_individual_data_07_14.csv')


ifls_2007 = ifls[year %in% 2007, kec_code] %>% unique

ifls_2014 = ifls[year %in% 2014, kec_code] %>% unique


################
#Desa crosswalk#
################

#Read in as list of state crosswalks
desa_crosswalks = list.files('./GIS_maps/desa_crosswalk') %>% as.list(.) %>%
  lapply(function(x) paste0('./GIS_maps/desa_crosswalk/',x)) %>%
  lapply(function(x) read.dbf(x, as.is = T)) %>%
  lapply(as.data.table)


###########################
###########################
##Functions for Crosswalk##
###########################
###########################

#Creates all observed sequences of kecamatan codes over time
clean_desa_crosswalk = function(dt, to_reference, from_reference) {
  
  #2 find id columns for each year 
  cols = names(dt)
  id_vars = cols[str_detect(cols, "^(ID)")]
  years = na.locf(str_extract(id_vars, "(?<=\\d\\d)(\\d\\d)|(\\d\\d$)"))
  years_full = c()
  for (y in 1:length(years)) {
    y_temp = as.numeric(years[y])
    if (y_temp>90) {years_full[y] = 1900 + y_temp}
    if (y_temp<90) {years_full[y] = 2000 + y_temp}
  }
  
  #Create new variables for each year
  year_idx = order(years_full)
  new_id_vars = paste("kec", years_full[year_idx], sequence(rle(years_full[year_idx])$lengths), sep = "_")
  
  #Extract kecamatan codes
  #unique paths
  dt[, u := 1:.N]
  #number of codes that are kecamatan in path
  dt[, kecamatan_count :=  lapply(.SD, function(x) str_detect(x, '(?<!0{3})0{3}$')) %>% unlist %>% sum(na.rm=T) , b= u , .SDcols = id_vars[year_idx]]
  #number of non missing code in path
  dt[, non_missing_count :=  lapply(.SD, function(x) !is.na(x)) %>% unlist %>% sum(na.rm=T) , b= u , .SDcols = id_vars[year_idx]]
  #percent kecematan codes out of non-missing in path
  dt[, kecamatan_pct := kecamatan_count/non_missing_count]
  
  #Create kecamatan variables
  dt[, (new_id_vars) := lapply(.SD, function(x) str_sub(x, start = 1L, end = 7L)), .SDcols = id_vars[year_idx]]
  
  #Best Kecamatan Paths
  dt_kec = dt[kecamatan_pct > 0, .SD, .SDcols = new_id_vars]
  
  #Check for missing kecamatan
  full_kec = dt_kec %>% unlist %>% unique %>% .[!is.na(.)]
  missing = setdiff(c(to_reference, from_reference), full_kec)
  
  dt_melt = melt(dt[kecamatan_pct == 0, .SD, .SDcols = c('u', new_id_vars)], id.vars = c('u'))
  dt_melt[, any_missing := any(value %in% missing), by = u]
  
  #If missing rows in remaining paths, add to kecamatan paths
  if (sum(dt_melt$any_missing) > 0 ) {
    dt_kec_missing = unique(dcast.data.table(dt_melt[any_missing %in% 1], u ~ variable, value.var = 'value'))
    dt_kec = rbindlist(list(dt_kec[, .SD, .SDcols = new_id_vars], 
                            dt_kec_missing[, .SD, .SDcols = new_id_vars]), 
                       use.names = T)
  }
  
  #Create all possible alternate paths (due to multiple kecamatan id columns per year)
  possible_sequences = expand.grid(split(new_id_vars, str_extract(new_id_vars, "(?<=kec_)\\d{4}(?=_\\d$)")))
  
  dt_kec_list = apply(possible_sequences, 1, function(x) dt_kec[, .SD, .SDcols = x])
  
  for (i in seq_along(dt_kec_list)) {
    dt_kec_list[[i]][, id := i]
  }
  
  #Count frequency of unique paths (within and across possible paths)
  dt_kec_all = rbindlist(dt_kec_list)
  dt_kec_all = dt_kec_all[, list(count = length(id)), by = names(dt_kec_all)]
  dt_kec_all = dt_kec_all[, list(count = sum(count)), by = setdiff(names(dt_kec_all), c('count', 'id'))]
  
  return(dt_kec_all)
}

# creates crosswalk between two lists of kecamatan
match_kecamatan = function(dt_list, to_reference, from_reference) {
  #Prepare for matching to reference
  to_reference_use = unique(to_reference) 
  from_reference_use = unique(from_reference)
  
  #Make draft kecamatan crosswalk
  cleaned_dt = dt_list %>% 
                lapply(., function(x) clean_desa_crosswalk(x, to_reference_use, from_reference_use)) %>% 
                rbindlist(fill = T)
  
  dt_kec_all = cleaned_dt[, list(count = sum(count)), by = setdiff(names(cleaned_dt), "count")]
    
  #check
  test_list_1 = dt_kec_all[, .SD, .SDcols = setdiff(names(dt_kec_all), 'count')] %>% unlist %>% unique %>% .[!is.na(.)]
  
  to_test_1 = sum(to_reference_use %in% test_list_1) / length(to_reference_use)
  from_test_1 = sum(from_reference_use %in% test_list_1) / length(from_reference_use)
  
  print(paste(to_test_1, ": fraction of target kecamatan in crosswalk"))
  print(paste(from_test_1, ": fraction of child kecamatan in crosswalk"))
            
  #Choose paths with maximum number of appearances that contain maximum possible set of reference kecamatan:
  counts = sort(unique(dt_kec_all$count), decreasing = T)
  
  find_count = data.table(count = counts, pct = rep(0, length(counts)))
  
  for (c in counts) {
    test_set = dt_kec_all[count >= c] %>% unlist %>% unique %>% .[!is.na(.)]
    test = setdiff(c(to_reference_use, from_reference_use), test_set)
    find_count[count %in% c, pct := 1 - (length(test) / length(c(to_reference_use, from_reference_use)))] 
  }
  
  c = find_count[pct %in% max(pct), count] %>% max
  dt_kec_use = dt_kec_all[count >= c]
  

  #Find children of "to" kecamatan
  dt_kec_use[, u := 1:.N]
  dt_kec_melt = melt(dt_kec_use, id.vars = c('u', 'count'))
  dt_kec_melt[, year := str_extract(variable, '(?<=kec_)\\d\\d\\d\\d(?=_1)')]
  
  #Which kecamatan are "to" kecamatan?
  dt_kec_melt[, use_to := ifelse(value %in% to_reference_use, 1, 0) ]
  #Find last appearing "to" kecamatan in path
  dt_kec_melt[, last_to_in_u := .SD[use_to == 1, tail(rle(value)$values, 1)], by = u]
  #What year is last appearance of last "to" kecamatan?
  dt_kec_melt[, last_to_in_u_year := .SD[use_to == 1, max(year[value %in% last_to_in_u]) ], by = u]
  
  #Check for number of final "to" kecamatan
  dt_kec_melt[, sum_to := length(unique(value[which(year >= last_to_in_u_year & use_to == 1)])), by = u]
  
  if (max(dt_kec_melt$sum_to) > 1) {
    print("Too many target kecamatan in sequence")
    break
  }
  
  #Keep only paths starting from "to" kecamatan
  to_paths = dt_kec_melt[sum_to %in% 1 & year >= last_to_in_u_year] %>%
              dcast.data.table(., u  + count + last_to_in_u ~ variable, value.var = 'value') %>%
              unique()
  
  #Find "best" parents
  #create all "to" parents and "from" children pairs
  to_paths_melt = melt(to_paths, id.vars = c('u', 'count', 'last_to_in_u')) %>% .[!is.na(value)]
  #which "to" parents are most common in the coding?
  to_paths_melt[, keep := last_to_in_u %in% last_to_in_u[which.max(count)], by = value] 
  to_paths_melt[, parent_count := length(unique(last_to_in_u[keep == T])) , by = value]
  
  #check parent count
  if (max(to_paths_melt$parent_count) > 1) {
    print ("Too many target parents for child kecamatan")
    break
  }
  
  #Keep best parents
  to_from_out = unique(to_paths_melt[keep == T, list(target_kecamatan = last_to_in_u, from_kecamatan = value)])
  
  #Check performance
  test_list = to_from_out %>% unlist %>% unique %>% .[!is.na(.)]
  
  to_test = sum(to_reference_use %in% test_list) / length(to_reference_use)
  from_test = sum(from_reference_use %in% test_list) / length(from_reference_use)

  print(paste(to_test, ": fraction of target kecamatan matched"))
  print(paste(from_test, ": fraction of child kecamatan matched"))
  
  return(to_from_out)
}


###################
###################
##Make crosswalks##
###################
###################

crosswalk_wb_to_dprd2004 = match_kecamatan(desa_crosswalks, u_kec_dprd_2004, wb_kecamatan)

crosswalk_wb_to_dprd2009 = match_kecamatan(desa_crosswalks, u_kec_dprd_2009, wb_kecamatan)

crosswalk_dprd2009_to_dprd2004 = match_kecamatan(desa_crosswalks, u_kec_dprd_2004, u_kec_dprd_2009)

crosswalk_podes2008_to_dprd_2004 = match_kecamatan(desa_crosswalks, u_kec_dprd_2004, podes_2008)

crosswalk_podes2014_to_dprd_2009 = match_kecamatan(desa_crosswalks, u_kec_dprd_2009, podes_2014)

crosswalk_attitudes_to_dprd_2004 = match_kecamatan(desa_crosswalks, u_kec_dprd_2004, attitudes)

crosswalk_attitudes_to_dprd_2009 = match_kecamatan(desa_crosswalks, u_kec_dprd_2009, attitudes)

crosswalk_ifls_to_dprd_2004 = match_kecamatan(desa_crosswalks, u_kec_dprd_2004, ifls_2007)

crosswalk_ifls_to_dprd_2009 = match_kecamatan(desa_crosswalks, u_kec_dprd_2009, ifls_2014)

crosswalk_dprd_2004_to_podes2003 = match_kecamatan(desa_crosswalks, podes_2003, u_kec_dprd_2004)

crosswalk_dprd_2009_to_podes2008 = match_kecamatan(desa_crosswalks, podes_2008, u_kec_dprd_2009)

crosswalk_dprd_2004_to_fractionalization = match_kecamatan(desa_crosswalks, fractionalization, u_kec_dprd_2009)
crosswalk_dprd_2009_to_fractionalization = match_kecamatan(desa_crosswalks, fractionalization, u_kec_dprd_2009)



write.csv(crosswalk_wb_to_dprd2004, './crosswalks/kecamatan_nvms_to_dprd2004.csv', row.names = F)
write.csv(crosswalk_wb_to_dprd2009, './crosswalks/kecamatan_nvms_to_dprd2009.csv', row.names = F)
write.csv(crosswalk_dprd2009_to_dprd2004, './crosswalks/kecamatan_dprd2009_to_dprd2004.csv', row.names = F)

write.csv(crosswalk_podes2008_to_dprd_2004, './crosswalks/kecamatan_podes2008_to_dprd2004.csv', row.names = F)
write.csv(crosswalk_podes2014_to_dprd_2009, './crosswalks/kecamatan_podes2014_to_dprd2009.csv', row.names = F)

write.csv(crosswalk_ifls_to_dprd_2004, './crosswalks/kecamatan_ifls_to_dprd2004.csv', row.names = F)
write.csv(crosswalk_ifls_to_dprd_2009, './crosswalks/kecamatan_ifls_to_dprd2009.csv', row.names = F)

write.csv(crosswalk_dprd_2004_to_podes2003, './crosswalks/kecamatan_dprd_2004_to_podes2003.csv', row.names = F)
write.csv(crosswalk_dprd_2009_to_podes2008, './crosswalks/kecamatan_dprd_2009_to_podes2008.csv', row.names = F)

write.csv(crosswalk_dprd_2004_to_fractionalization, './crosswalks/kecamatan_dprd_2004_to_fractionalization.csv', row.names = F)
write.csv(crosswalk_dprd_2009_to_fractionalization, './crosswalks/kecamatan_dprd_2009_to_fractionalization.csv', row.names = F)
